'''
coding: utf-8
@Software: PyCharm
@Author: JiangDa
@File: DataClear.py
@Time: 2023/8/22 022 9:42
@Description: Data cleaning (normalizes the salary column of a job-listing CSV)
'''

import pandas as pd
import re


# Salary-column cleaning.

# Multipliers for the Chinese magnitude suffixes that may follow a number.
_SALARY_UNITS = {'千': 1000, '万': 10000}

# One salary endpoint: a decimal number optionally followed by 千 or 万,
# with optional surrounding whitespace.
_SALARY_PART_RE = re.compile(r'\s*(\d+(?:\.\d*)?|\.\d+)\s*([千万]?)\s*')


def _parse_salary_part(text):
    """Convert one salary endpoint such as '8千' or '1.2万' to a float.

    Returns None when ``text`` is not a plain number with an optional unit.
    """
    match = _SALARY_PART_RE.fullmatch(text)
    if match is None:
        return None
    number, unit = match.groups()
    # A missing unit means the number is already expressed in base units.
    return float(number) * _SALARY_UNITS.get(unit, 1)


def _salary_midpoint(value):
    """Return the midpoint of a 'low-high' salary string, or '' if unparseable."""
    # Empty CSV cells are read by pandas as NaN (a float), not as a string.
    if not isinstance(value, str):
        return ''
    # Split on the LAST dash, matching the greedy '(.*)-(.*)' split that this
    # function historically used.
    low_text, dash, high_text = value.rpartition('-')
    if not dash:
        return ''
    low = _parse_salary_part(low_text)
    high = _parse_salary_part(high_text)
    if low is None or high is None:
        return ''
    return (low + high) / 2


def data_clear(filename, output_path='java2.csv'):
    """Replace each salary range in a CSV with its numeric midpoint.

    Reads ``filename`` as UTF-8 CSV and rewrites the '薪资' column: a value of
    the form 'low-high' (each endpoint a number optionally suffixed with
    千 = 1,000 or 万 = 10,000) becomes the float ``(low + high) / 2``; any
    other value (no dash, empty cell, unparseable endpoint) becomes ''.
    The resulting frame, including its index, is written to ``output_path``.

    Endpoints are parsed as numbers only; cell text is never evaluated as
    Python code, so a crafted CSV cannot execute anything.

    Args:
        filename: Path of the input CSV; it must contain a '薪资' column.
        output_path: Destination CSV path. Defaults to 'java2.csv' in the
            current working directory.

    Raises:
        KeyError: If the input has no '薪资' column.
    """
    df = pd.read_csv(filename, encoding='utf-8')

    midpoints = [_salary_midpoint(raw) for raw in df['薪资']]
    # Replace the whole column at once; dtype=object keeps the float / ''
    # mixture intact regardless of the column's original dtype.
    df['薪资'] = pd.Series(midpoints, index=df.index, dtype=object)

    df.to_csv(output_path)


if __name__ == "__main__":
    import sys

    # Input CSV to clean: the first command-line argument when one is given,
    # otherwise the original default location.
    filename = sys.argv[1] if len(sys.argv) > 1 else 'D:/project/BS/zhilian/spider/java.csv'
    data_clear(filename)
